from vllm import LLM, SamplingParams
import os

# Directory handed to vLLM's torch profiler. It is set at module level, before
# the engine is constructed, so it is already in the environment when
# LLM(...) runs in main().
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile_batch8"


def main() -> None:
    """Profile one short CPU text generation with vLLM and print the result.

    Builds an ``LLM`` from the model files in the current working directory,
    brackets a single ``generate`` call with ``start_profile`` /
    ``stop_profile``, and prints the text of the first completion.

    Raises:
        Whatever ``LLM(...)`` or ``generate`` raises is propagated unchanged;
        ``stop_profile`` is still called first when generation fails.
    """
    # The model is loaded from the current working directory.
    model_path = os.curdir
    llm = LLM(
        model=model_path,
        device="cpu",  # key setting: run on the CPU backend
        dtype="float32",  # CPU inference typically uses FP32
        # Original intent: do not use the GPU.
        # NOTE(review): on the CPU backend the KV cache size is presumably
        # governed by VLLM_CPU_KVCACHE_SPACE rather than this argument, and
        # some vLLM versions may reject 0 here -- confirm against the
        # installed version.
        gpu_memory_utilization=0,
        max_model_len=8192,
        block_size=16,
    )

    llm.start_profile()
    try:
        # Ordinary text generation; this is the region being profiled.
        sampling_params = SamplingParams(max_tokens=50)
        outputs = llm.generate(["Who are you?"], sampling_params)
    finally:
        # Always close the profiler session, even if generation raised.
        llm.stop_profile()

    print(outputs[0].outputs[0].text)


# Guard the entry point: when worker processes are started with the "spawn"
# method, this module is re-imported in the child, and unguarded top-level code
# would build the engine and run generation a second time.
if __name__ == "__main__":
    main()